import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv('athlete_events.csv')
region_df = pd.read_csv('noc_regions.csv')
df.tail()
| ID | Name | Sex | Age | Height | Weight | Team | NOC | Games | Year | Season | City | Sport | Event | Medal | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 271111 | 135569 | Andrzej ya | M | 29.0 | 179.0 | 89.0 | Poland-1 | POL | 1976 Winter | 1976 | Winter | Innsbruck | Luge | Luge Mixed (Men)'s Doubles | NaN |
| 271112 | 135570 | Piotr ya | M | 27.0 | 176.0 | 59.0 | Poland | POL | 2014 Winter | 2014 | Winter | Sochi | Ski Jumping | Ski Jumping Men's Large Hill, Individual | NaN |
| 271113 | 135570 | Piotr ya | M | 27.0 | 176.0 | 59.0 | Poland | POL | 2014 Winter | 2014 | Winter | Sochi | Ski Jumping | Ski Jumping Men's Large Hill, Team | NaN |
| 271114 | 135571 | Tomasz Ireneusz ya | M | 30.0 | 185.0 | 96.0 | Poland | POL | 1998 Winter | 1998 | Winter | Nagano | Bobsleigh | Bobsleigh Men's Four | NaN |
| 271115 | 135571 | Tomasz Ireneusz ya | M | 34.0 | 185.0 | 96.0 | Poland | POL | 2002 Winter | 2002 | Winter | Salt Lake City | Bobsleigh | Bobsleigh Men's Four | NaN |
df.shape
(271116, 15)
#To FILTER summer season data only
df = df[df['Season']=='Summer']
df.shape
(222552, 15)
df.tail()
| ID | Name | Sex | Age | Height | Weight | Team | NOC | Games | Year | Season | City | Sport | Event | Medal | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 271106 | 135565 | Fernando scar Zylberberg | M | 27.0 | 168.0 | 76.0 | Argentina | ARG | 2004 Summer | 2004 | Summer | Athina | Hockey | Hockey Men's Hockey | NaN |
| 271107 | 135566 | James Francis "Jim" Zylker | M | 21.0 | 175.0 | 75.0 | United States | USA | 1972 Summer | 1972 | Summer | Munich | Football | Football Men's Football | NaN |
| 271108 | 135567 | Aleksandr Viktorovich Zyuzin | M | 24.0 | 183.0 | 72.0 | Russia | RUS | 2000 Summer | 2000 | Summer | Sydney | Rowing | Rowing Men's Lightweight Coxless Fours | NaN |
| 271109 | 135567 | Aleksandr Viktorovich Zyuzin | M | 28.0 | 183.0 | 72.0 | Russia | RUS | 2004 Summer | 2004 | Summer | Athina | Rowing | Rowing Men's Lightweight Coxless Fours | NaN |
| 271110 | 135568 | Olga Igorevna Zyuzkova | F | 33.0 | 171.0 | 69.0 | Belarus | BLR | 2016 Summer | 2016 | Summer | Rio de Janeiro | Basketball | Basketball Women's Basketball | NaN |
region_df.tail()
| NOC | region | notes | |
|---|---|---|---|
| 225 | YEM | Yemen | NaN |
| 226 | YMD | Yemen | South Yemen |
| 227 | YUG | Serbia | Yugoslavia |
| 228 | ZAM | Zambia | NaN |
| 229 | ZIM | Zimbabwe | NaN |
#to LEFT JOIN both tables
df = df.merge(region_df,on = 'NOC',how = 'left')
df.tail()
| ID | Name | Sex | Age | Height | Weight | Team | NOC | Games | Year | Season | City | Sport | Event | Medal | region | notes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 222547 | 135565 | Fernando scar Zylberberg | M | 27.0 | 168.0 | 76.0 | Argentina | ARG | 2004 Summer | 2004 | Summer | Athina | Hockey | Hockey Men's Hockey | NaN | Argentina | NaN |
| 222548 | 135566 | James Francis "Jim" Zylker | M | 21.0 | 175.0 | 75.0 | United States | USA | 1972 Summer | 1972 | Summer | Munich | Football | Football Men's Football | NaN | USA | NaN |
| 222549 | 135567 | Aleksandr Viktorovich Zyuzin | M | 24.0 | 183.0 | 72.0 | Russia | RUS | 2000 Summer | 2000 | Summer | Sydney | Rowing | Rowing Men's Lightweight Coxless Fours | NaN | Russia | NaN |
| 222550 | 135567 | Aleksandr Viktorovich Zyuzin | M | 28.0 | 183.0 | 72.0 | Russia | RUS | 2004 Summer | 2004 | Summer | Athina | Rowing | Rowing Men's Lightweight Coxless Fours | NaN | Russia | NaN |
| 222551 | 135568 | Olga Igorevna Zyuzkova | F | 33.0 | 171.0 | 69.0 | Belarus | BLR | 2016 Summer | 2016 | Summer | Rio de Janeiro | Basketball | Basketball Women's Basketball | NaN | Belarus | NaN |
#how many countries participated
df['region'].unique()
array(['China', 'Denmark', 'Netherlands', 'Finland', 'Norway', 'Romania',
'Estonia', 'France', 'Morocco', 'Spain', 'Egypt', 'Iran',
'Bulgaria', 'Italy', 'Chad', 'Azerbaijan', 'Sudan', 'Russia',
'Argentina', 'Cuba', 'Belarus', 'Greece', 'Cameroon', 'Turkey',
'Chile', 'Mexico', 'USA', 'Nicaragua', 'Hungary', 'Nigeria',
'Algeria', 'Kuwait', 'Bahrain', 'Pakistan', 'Iraq', 'Syria',
'Lebanon', 'Qatar', 'Malaysia', 'Germany', 'Canada', 'Ireland',
'Australia', 'South Africa', 'Eritrea', 'Tanzania', 'Jordan',
'Tunisia', 'Libya', 'Belgium', 'Djibouti', 'Palestine', 'Comoros',
'Kazakhstan', 'Brunei', 'India', 'Saudi Arabia', 'Maldives',
'Ethiopia', 'United Arab Emirates', 'Yemen', 'Indonesia',
'Philippines', nan, 'Uzbekistan', 'Kyrgyzstan', 'Tajikistan',
'Japan', 'Republic of Congo', 'Switzerland', 'Brazil', 'Monaco',
'Israel', 'Uruguay', 'Sweden', 'Sri Lanka', 'Armenia',
'Ivory Coast', 'Kenya', 'Benin', 'UK', 'Ghana', 'Somalia', 'Niger',
'Mali', 'Afghanistan', 'Poland', 'Costa Rica', 'Panama', 'Georgia',
'Slovenia', 'Guyana', 'New Zealand', 'Portugal', 'Paraguay',
'Angola', 'Venezuela', 'Colombia', 'Bangladesh', 'Peru',
'El Salvador', 'Puerto Rico', 'Uganda', 'Honduras', 'Ecuador',
'Turkmenistan', 'Mauritius', 'Seychelles', 'Czech Republic',
'Luxembourg', 'Mauritania', 'Saint Kitts', 'Trinidad',
'Dominican Republic', 'Saint Vincent', 'Jamaica', 'Liberia',
'Suriname', 'Nepal', 'Mongolia', 'Austria', 'Palau', 'Lithuania',
'Togo', 'Namibia', 'Curacao', 'Ukraine', 'Iceland',
'American Samoa', 'Samoa', 'Rwanda', 'Croatia', 'Dominica',
'Haiti', 'Malta', 'Cyprus', 'Guinea', 'Belize', 'Thailand',
'Bermuda', 'Serbia', 'Sierra Leone', 'Papua New Guinea',
'Individual Olympic Athletes', 'Oman', 'Fiji', 'Vanuatu',
'Moldova', 'Bahamas', 'Guatemala', 'Latvia',
'Virgin Islands, British', 'Mozambique', 'Virgin Islands, US',
'Central African Republic', 'Madagascar', 'Bosnia and Herzegovina',
'Guam', 'Cayman Islands', 'Slovakia', 'Barbados', 'Guinea-Bissau',
'Timor-Leste', 'Democratic Republic of the Congo', 'Gabon',
'San Marino', 'Laos', 'Botswana', 'South Korea', 'Cambodia',
'North Korea', 'Solomon Islands', 'Senegal', 'Cape Verde',
'Equatorial Guinea', 'Boliva', 'Antigua', 'Andorra', 'Zimbabwe',
'Grenada', 'Saint Lucia', 'Micronesia', 'Myanmar', 'Malawi',
'Zambia', 'Taiwan', 'Sao Tome and Principe', 'Macedonia',
'Liechtenstein', 'Montenegro', 'Gambia', 'Cook Islands', 'Albania',
'Swaziland', 'Burkina Faso', 'Burundi', 'Aruba', 'Nauru',
'Vietnam', 'Bhutan', 'Marshall Islands', 'Kiribati', 'Tonga',
'Kosovo', 'South Sudan', 'Lesotho'], dtype=object)
df['region'].unique().shape
(206,)
df.isnull().sum()
ID 0 Name 0 Sex 0 Age 9189 Height 51857 Weight 53854 Team 0 NOC 0 Games 0 Year 0 Season 0 City 0 Sport 0 Event 0 Medal 188464 region 370 notes 218151 dtype: int64
df.duplicated().sum()
1385
df.drop_duplicates(inplace = True)
df.duplicated().sum()
0
df['Medal'].value_counts()
Gold 11456 Bronze 11409 Silver 11212 Name: Medal, dtype: int64
pd.get_dummies(df['Medal'])
| Bronze | Gold | Silver | |
|---|---|---|---|
| 0 | 0 | 0 | 0 |
| 1 | 0 | 0 | 0 |
| 2 | 0 | 0 | 0 |
| 3 | 0 | 1 | 0 |
| 4 | 0 | 0 | 0 |
| ... | ... | ... | ... |
| 222547 | 0 | 0 | 0 |
| 222548 | 0 | 0 | 0 |
| 222549 | 0 | 0 | 0 |
| 222550 | 0 | 0 | 0 |
| 222551 | 0 | 0 | 0 |
221167 rows × 3 columns
df = pd.concat([df,pd.get_dummies(df['Medal'])],axis = 1) #concat horizontally
df.groupby('NOC').sum()[['Gold','Silver','Bronze']].sort_values('Gold',ascending = False).reset_index()
| NOC | Gold | Silver | Bronze | |
|---|---|---|---|---|
| 0 | USA | 2472.0 | 1333.0 | 1197.0 |
| 1 | URS | 832.0 | 635.0 | 596.0 |
| 2 | GBR | 635.0 | 729.0 | 620.0 |
| 3 | GER | 592.0 | 538.0 | 649.0 |
| 4 | ITA | 518.0 | 474.0 | 454.0 |
| ... | ... | ... | ... | ... |
| 225 | AHO | 0.0 | 1.0 | 0.0 |
| 226 | LBR | 0.0 | 0.0 | 0.0 |
| 227 | LCA | 0.0 | 0.0 | 0.0 |
| 228 | LES | 0.0 | 0.0 | 0.0 |
| 229 | LBA | 0.0 | 0.0 | 0.0 |
230 rows × 4 columns
df.tail()
| ID | Name | Sex | Age | Height | Weight | Team | NOC | Games | Year | Season | City | Sport | Event | Medal | region | notes | Bronze | Gold | Silver | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 222547 | 135565 | Fernando scar Zylberberg | M | 27.0 | 168.0 | 76.0 | Argentina | ARG | 2004 Summer | 2004 | Summer | Athina | Hockey | Hockey Men's Hockey | NaN | Argentina | NaN | 0 | 0 | 0 |
| 222548 | 135566 | James Francis "Jim" Zylker | M | 21.0 | 175.0 | 75.0 | United States | USA | 1972 Summer | 1972 | Summer | Munich | Football | Football Men's Football | NaN | USA | NaN | 0 | 0 | 0 |
| 222549 | 135567 | Aleksandr Viktorovich Zyuzin | M | 24.0 | 183.0 | 72.0 | Russia | RUS | 2000 Summer | 2000 | Summer | Sydney | Rowing | Rowing Men's Lightweight Coxless Fours | NaN | Russia | NaN | 0 | 0 | 0 |
| 222550 | 135567 | Aleksandr Viktorovich Zyuzin | M | 28.0 | 183.0 | 72.0 | Russia | RUS | 2004 Summer | 2004 | Summer | Athina | Rowing | Rowing Men's Lightweight Coxless Fours | NaN | Russia | NaN | 0 | 0 | 0 |
| 222551 | 135568 | Olga Igorevna Zyuzkova | F | 33.0 | 171.0 | 69.0 | Belarus | BLR | 2016 Summer | 2016 | Summer | Rio de Janeiro | Basketball | Basketball Women's Basketball | NaN | Belarus | NaN | 0 | 0 | 0 |
#to get rows where India won gold medal
df[(df['NOC']=='IND')&(df['Medal']=='Gold')].shape
(131, 20)
#so drop duplicates to count 1 medal for 1 team instead of each player
medal_tally = df.drop_duplicates(subset=['NOC','Games','Sport','Event','Medal'])
medal_tally.tail()
| ID | Name | Sex | Age | Height | Weight | Team | NOC | Games | Year | Season | City | Sport | Event | Medal | region | notes | Bronze | Gold | Silver | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 222528 | 135553 | Galina Ivanovna Zybina (-Fyodorova) | F | 25.0 | 168.0 | 80.0 | Soviet Union | URS | 1956 Summer | 1956 | Summer | Melbourne | Athletics | Athletics Women's Shot Put | Silver | Russia | NaN | 0 | 0 | 1 |
| 222530 | 135553 | Galina Ivanovna Zybina (-Fyodorova) | F | 33.0 | 168.0 | 80.0 | Soviet Union | URS | 1964 Summer | 1964 | Summer | Tokyo | Athletics | Athletics Women's Shot Put | Bronze | Russia | NaN | 1 | 0 | 0 |
| 222536 | 135556 | Bogusaw Stanisaw Zychowicz | M | 19.0 | 189.0 | 80.0 | Poland | POL | 1980 Summer | 1980 | Summer | Moskva | Swimming | Swimming Men's 100 metres Butterfly | NaN | Poland | NaN | 0 | 0 | 0 |
| 222537 | 135556 | Bogusaw Stanisaw Zychowicz | M | 19.0 | 189.0 | 80.0 | Poland | POL | 1980 Summer | 1980 | Summer | Moskva | Swimming | Swimming Men's 200 metres Butterfly | NaN | Poland | NaN | 0 | 0 | 0 |
| 222541 | 135560 | Stavroula Zygouri | F | 36.0 | 171.0 | 63.0 | Greece | GRE | 2004 Summer | 2004 | Summer | Athina | Wrestling | Wrestling Women's Middleweight, Freestyle | NaN | Greece | NaN | 0 | 0 | 0 |
medal_tally = medal_tally.groupby('NOC').sum()[['Gold','Silver','Bronze']].sort_values('Gold',ascending = False).reset_index()
medal_tally.head()
| NOC | Gold | Silver | Bronze | |
|---|---|---|---|---|
| 0 | USA | 1035.0 | 802.0 | 707.0 |
| 1 | URS | 394.0 | 317.0 | 294.0 |
| 2 | GBR | 278.0 | 316.0 | 298.0 |
| 3 | GER | 233.0 | 261.0 | 282.0 |
| 4 | FRA | 233.0 | 255.0 | 282.0 |
medal_tally[medal_tally['NOC']=='IND']
| NOC | Gold | Silver | Bronze | |
|---|---|---|---|---|
| 53 | IND | 9.0 | 7.0 | 12.0 |
medal_tally['total'] = medal_tally['Gold']+medal_tally['Silver']+medal_tally['Bronze']
medal_tally.head()
| NOC | Gold | Silver | Bronze | total | |
|---|---|---|---|---|---|
| 0 | USA | 1035.0 | 802.0 | 707.0 | 2544.0 |
| 1 | URS | 394.0 | 317.0 | 294.0 | 1005.0 |
| 2 | GBR | 278.0 | 316.0 | 298.0 | 892.0 |
| 3 | GER | 233.0 | 261.0 | 282.0 | 776.0 |
| 4 | FRA | 233.0 | 255.0 | 282.0 | 770.0 |
years = df['Year'].unique().tolist()
years.sort()
years
[1896, 1900, 1904, 1906, 1908, 1912, 1920, 1924, 1928, 1932, 1936, 1948, 1952, 1956, 1960, 1964, 1968, 1972, 1976, 1980, 1984, 1988, 1992, 1996, 2000, 2004, 2008, 2012, 2016]
years.insert(0,'Overall')
years
['Overall', 1896, 1900, 1904, 1906, 1908, 1912, 1920, 1924, 1928, 1932, 1936, 1948, 1952, 1956, 1960, 1964, 1968, 1972, 1976, 1980, 1984, 1988, 1992, 1996, 2000, 2004, 2008, 2012, 2016]
country=df['region'].dropna().unique().tolist()
country.sort()
country
['Afghanistan', 'Albania', 'Algeria', 'American Samoa', 'Andorra', 'Angola', 'Antigua', 'Argentina', 'Armenia', 'Aruba', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bermuda', 'Bhutan', 'Boliva', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Brunei', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon', 'Canada', 'Cape Verde', 'Cayman Islands', 'Central African Republic', 'Chad', 'Chile', 'China', 'Colombia', 'Comoros', 'Cook Islands', 'Costa Rica', 'Croatia', 'Cuba', 'Curacao', 'Cyprus', 'Czech Republic', 'Democratic Republic of the Congo', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia', 'Ethiopia', 'Fiji', 'Finland', 'France', 'Gabon', 'Gambia', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Grenada', 'Guam', 'Guatemala', 'Guinea', 'Guinea-Bissau', 'Guyana', 'Haiti', 'Honduras', 'Hungary', 'Iceland', 'India', 'Individual Olympic Athletes', 'Indonesia', 'Iran', 'Iraq', 'Ireland', 'Israel', 'Italy', 'Ivory Coast', 'Jamaica', 'Japan', 'Jordan', 'Kazakhstan', 'Kenya', 'Kiribati', 'Kosovo', 'Kuwait', 'Kyrgyzstan', 'Laos', 'Latvia', 'Lebanon', 'Lesotho', 'Liberia', 'Libya', 'Liechtenstein', 'Lithuania', 'Luxembourg', 'Macedonia', 'Madagascar', 'Malawi', 'Malaysia', 'Maldives', 'Mali', 'Malta', 'Marshall Islands', 'Mauritania', 'Mauritius', 'Mexico', 'Micronesia', 'Moldova', 'Monaco', 'Mongolia', 'Montenegro', 'Morocco', 'Mozambique', 'Myanmar', 'Namibia', 'Nauru', 'Nepal', 'Netherlands', 'New Zealand', 'Nicaragua', 'Niger', 'Nigeria', 'North Korea', 'Norway', 'Oman', 'Pakistan', 'Palau', 'Palestine', 'Panama', 'Papua New Guinea', 'Paraguay', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Puerto Rico', 'Qatar', 'Republic of Congo', 'Romania', 'Russia', 'Rwanda', 'Saint Kitts', 'Saint Lucia', 'Saint Vincent', 'Samoa', 'San Marino', 'Sao Tome and Principe', 'Saudi Arabia', 'Senegal', 'Serbia', 'Seychelles', 'Sierra Leone', 'Slovakia', 'Slovenia', 'Solomon Islands', 'Somalia', 'South Africa', 'South Korea', 'South Sudan', 'Spain', 'Sri Lanka', 'Sudan', 'Suriname', 'Swaziland', 'Sweden', 'Switzerland', 'Syria', 'Taiwan', 'Tajikistan', 'Tanzania', 'Thailand', 'Timor-Leste', 'Togo', 'Tonga', 'Trinidad', 'Tunisia', 'Turkey', 'Turkmenistan', 'UK', 'USA', 'Uganda', 'Ukraine', 'United Arab Emirates', 'Uruguay', 'Uzbekistan', 'Vanuatu', 'Venezuela', 'Vietnam', 'Virgin Islands, British', 'Virgin Islands, US', 'Yemen', 'Zambia', 'Zimbabwe']
country.insert(0,'Overall')
medal_tally1 = df.drop_duplicates(subset=['NOC','Games','Sport','Event','Medal'])
country
['Overall', 'Afghanistan', 'Albania', 'Algeria', 'American Samoa', 'Andorra', 'Angola', 'Antigua', 'Argentina', 'Armenia', 'Aruba', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bermuda', 'Bhutan', 'Boliva', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Brunei', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon', 'Canada', 'Cape Verde', 'Cayman Islands', 'Central African Republic', 'Chad', 'Chile', 'China', 'Colombia', 'Comoros', 'Cook Islands', 'Costa Rica', 'Croatia', 'Cuba', 'Curacao', 'Cyprus', 'Czech Republic', 'Democratic Republic of the Congo', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia', 'Ethiopia', 'Fiji', 'Finland', 'France', 'Gabon', 'Gambia', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Grenada', 'Guam', 'Guatemala', 'Guinea', 'Guinea-Bissau', 'Guyana', 'Haiti', 'Honduras', 'Hungary', 'Iceland', 'India', 'Individual Olympic Athletes', 'Indonesia', 'Iran', 'Iraq', 'Ireland', 'Israel', 'Italy', 'Ivory Coast', 'Jamaica', 'Japan', 'Jordan', 'Kazakhstan', 'Kenya', 'Kiribati', 'Kosovo', 'Kuwait', 'Kyrgyzstan', 'Laos', 'Latvia', 'Lebanon', 'Lesotho', 'Liberia', 'Libya', 'Liechtenstein', 'Lithuania', 'Luxembourg', 'Macedonia', 'Madagascar', 'Malawi', 'Malaysia', 'Maldives', 'Mali', 'Malta', 'Marshall Islands', 'Mauritania', 'Mauritius', 'Mexico', 'Micronesia', 'Moldova', 'Monaco', 'Mongolia', 'Montenegro', 'Morocco', 'Mozambique', 'Myanmar', 'Namibia', 'Nauru', 'Nepal', 'Netherlands', 'New Zealand', 'Nicaragua', 'Niger', 'Nigeria', 'North Korea', 'Norway', 'Oman', 'Pakistan', 'Palau', 'Palestine', 'Panama', 'Papua New Guinea', 'Paraguay', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Puerto Rico', 'Qatar', 'Republic of Congo', 'Romania', 'Russia', 'Rwanda', 'Saint Kitts', 'Saint Lucia', 'Saint Vincent', 'Samoa', 'San Marino', 'Sao Tome and Principe', 'Saudi Arabia', 'Senegal', 'Serbia', 'Seychelles', 'Sierra Leone', 'Slovakia', 'Slovenia', 'Solomon Islands', 'Somalia', 'South Africa', 'South Korea', 'South Sudan', 'Spain', 'Sri Lanka', 'Sudan', 'Suriname', 'Swaziland', 'Sweden', 'Switzerland', 'Syria', 'Taiwan', 'Tajikistan', 'Tanzania', 'Thailand', 'Timor-Leste', 'Togo', 'Tonga', 'Trinidad', 'Tunisia', 'Turkey', 'Turkmenistan', 'UK', 'USA', 'Uganda', 'Ukraine', 'United Arab Emirates', 'Uruguay', 'Uzbekistan', 'Vanuatu', 'Venezuela', 'Vietnam', 'Virgin Islands, British', 'Virgin Islands, US', 'Yemen', 'Zambia', 'Zimbabwe']
def fetch_medal_tally(df,year,country):
medal_tally1 = df.drop_duplicates(subset=['NOC','Games','Sport','Event','Medal'])
flag = 0
if year == 'Overall' and country == 'Overall':
temp_df = medal_tally1
if year == 'Overall' and country != 'Overall':
flag = 1
temp_df = medal_tally1[medal_tally1['region']==country]
if year != 'Overall' and country == 'Overall':
temp_df = medal_tally1[medal_tally1['Year']==year]
if year != 'Overall' and country != 'Overall':
temp_df = medal_tally1[(medal_tally1['region']==country) & (medal_tally1['Year']==int(year))]
if(flag==1): #we want to check performance of a country in every year olympic
x = temp_df.groupby('Year').sum()[['Gold','Silver','Bronze']].sort_values('Gold',ascending = False).reset_index()
else:
x = temp_df.groupby('region').sum()[['Gold','Silver','Bronze']].sort_values('Gold',ascending = False).reset_index()
x['Total']=x['Gold']+x['Silver']+x['Bronze']
return x
fetch_medal_tally(df,1992,'Overall')
| region | Gold | Silver | Bronze | Total | |
|---|---|---|---|---|---|
| 0 | Russia | 45 | 38 | 29 | 112 |
| 1 | USA | 37 | 34 | 37 | 108 |
| 2 | Germany | 33 | 21 | 28 | 82 |
| 3 | China | 16 | 22 | 15 | 53 |
| 4 | Cuba | 14 | 6 | 11 | 31 |
| ... | ... | ... | ... | ... | ... |
| 162 | Haiti | 0 | 0 | 0 | 0 |
| 163 | Honduras | 0 | 0 | 0 | 0 |
| 164 | Iceland | 0 | 0 | 0 | 0 |
| 165 | India | 0 | 0 | 0 | 0 |
| 166 | Zimbabwe | 0 | 0 | 0 | 0 |
167 rows × 5 columns
medal_tally1
| ID | Name | Sex | Age | Height | Weight | Team | NOC | Games | Year | Season | City | Sport | Event | Medal | region | notes | Bronze | Gold | Silver | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | A Dijiang | M | 24.0 | 180.0 | 80.0 | China | CHN | 1992 Summer | 1992 | Summer | Barcelona | Basketball | Basketball Men's Basketball | NaN | China | NaN | 0 | 0 | 0 |
| 1 | 2 | A Lamusi | M | 23.0 | 170.0 | 60.0 | China | CHN | 2012 Summer | 2012 | Summer | London | Judo | Judo Men's Extra-Lightweight | NaN | China | NaN | 0 | 0 | 0 |
| 2 | 3 | Gunnar Nielsen Aaby | M | 24.0 | NaN | NaN | Denmark | DEN | 1920 Summer | 1920 | Summer | Antwerpen | Football | Football Men's Football | NaN | Denmark | NaN | 0 | 0 | 0 |
| 3 | 4 | Edgar Lindenau Aabye | M | 34.0 | NaN | NaN | Denmark/Sweden | DEN | 1900 Summer | 1900 | Summer | Paris | Tug-Of-War | Tug-Of-War Men's Tug-Of-War | Gold | Denmark | NaN | 0 | 1 | 0 |
| 4 | 8 | Cornelia "Cor" Aalten (-Strannood) | F | 18.0 | 168.0 | NaN | Netherlands | NED | 1932 Summer | 1932 | Summer | Los Angeles | Athletics | Athletics Women's 100 metres | NaN | Netherlands | NaN | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 222528 | 135553 | Galina Ivanovna Zybina (-Fyodorova) | F | 25.0 | 168.0 | 80.0 | Soviet Union | URS | 1956 Summer | 1956 | Summer | Melbourne | Athletics | Athletics Women's Shot Put | Silver | Russia | NaN | 0 | 0 | 1 |
| 222530 | 135553 | Galina Ivanovna Zybina (-Fyodorova) | F | 33.0 | 168.0 | 80.0 | Soviet Union | URS | 1964 Summer | 1964 | Summer | Tokyo | Athletics | Athletics Women's Shot Put | Bronze | Russia | NaN | 1 | 0 | 0 |
| 222536 | 135556 | Bogusaw Stanisaw Zychowicz | M | 19.0 | 189.0 | 80.0 | Poland | POL | 1980 Summer | 1980 | Summer | Moskva | Swimming | Swimming Men's 100 metres Butterfly | NaN | Poland | NaN | 0 | 0 | 0 |
| 222537 | 135556 | Bogusaw Stanisaw Zychowicz | M | 19.0 | 189.0 | 80.0 | Poland | POL | 1980 Summer | 1980 | Summer | Moskva | Swimming | Swimming Men's 200 metres Butterfly | NaN | Poland | NaN | 0 | 0 | 0 |
| 222541 | 135560 | Stavroula Zygouri | F | 36.0 | 171.0 | 63.0 | Greece | GRE | 2004 Summer | 2004 | Summer | Athina | Wrestling | Wrestling Women's Middleweight, Freestyle | NaN | Greece | NaN | 0 | 0 | 0 |
105168 rows × 20 columns
fetch_medal_tally(df,'Overall','Norway')
| Year | Gold | Silver | Bronze | Total | |
|---|---|---|---|---|---|
| 0 | 1920 | 10 | 10 | 9 | 29 |
| 1 | 1924 | 5 | 2 | 3 | 10 |
| 2 | 2004 | 5 | 0 | 1 | 6 |
| 3 | 2000 | 4 | 3 | 3 | 10 |
| 4 | 1912 | 4 | 1 | 5 | 10 |
| 5 | 1952 | 3 | 2 | 0 | 5 |
| 6 | 2008 | 3 | 5 | 1 | 9 |
| 7 | 1992 | 2 | 4 | 1 | 7 |
| 8 | 1988 | 2 | 3 | 0 | 5 |
| 9 | 1972 | 2 | 1 | 1 | 4 |
| 10 | 1904 | 2 | 0 | 0 | 2 |
| 11 | 1996 | 2 | 2 | 3 | 7 |
| 12 | 2012 | 2 | 1 | 1 | 4 |
| 13 | 1908 | 2 | 3 | 3 | 8 |
| 14 | 1956 | 1 | 0 | 2 | 3 |
| 15 | 1948 | 1 | 3 | 3 | 7 |
| 16 | 1936 | 1 | 3 | 2 | 6 |
| 17 | 1968 | 1 | 1 | 0 | 2 |
| 18 | 1976 | 1 | 1 | 0 | 2 |
| 19 | 1928 | 1 | 2 | 1 | 4 |
| 20 | 1906 | 1 | 1 | 0 | 2 |
| 21 | 1960 | 1 | 0 | 0 | 1 |
| 22 | 1900 | 0 | 2 | 3 | 5 |
| 23 | 1984 | 0 | 1 | 2 | 3 |
| 24 | 1964 | 0 | 0 | 0 | 0 |
| 25 | 1932 | 0 | 0 | 0 | 0 |
| 26 | 2016 | 0 | 0 | 4 | 4 |
fetch_medal_tally(df,'Overall','Overall')
| region | Gold | Silver | Bronze | Total | |
|---|---|---|---|---|---|
| 0 | USA | 1035.0 | 802.0 | 707.0 | 2544.0 |
| 1 | Russia | 592.0 | 498.0 | 487.0 | 1577.0 |
| 2 | Germany | 442.0 | 457.0 | 490.0 | 1389.0 |
| 3 | UK | 278.0 | 316.0 | 298.0 | 892.0 |
| 4 | France | 233.0 | 255.0 | 282.0 | 770.0 |
| ... | ... | ... | ... | ... | ... |
| 200 | Lesotho | 0.0 | 0.0 | 0.0 | 0.0 |
| 201 | Albania | 0.0 | 0.0 | 0.0 | 0.0 |
| 202 | Libya | 0.0 | 0.0 | 0.0 | 0.0 |
| 203 | Liechtenstein | 0.0 | 0.0 | 0.0 | 0.0 |
| 204 | Liberia | 0.0 | 0.0 | 0.0 | 0.0 |
205 rows × 5 columns
fetch_medal_tally(df,2016,'USA')
| region | Gold | Silver | Bronze | Total | |
|---|---|---|---|---|---|
| 0 | USA | 46 | 37 | 38 | 121 |
#Overall Analysis
#No. of editions, cities, events/sports, athletes, participating nations
df.head()
| ID | Name | Sex | Age | Height | Weight | Team | NOC | Games | Year | Season | City | Sport | Event | Medal | region | notes | Bronze | Gold | Silver | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | A Dijiang | M | 24.0 | 180.0 | 80.0 | China | CHN | 1992 Summer | 1992 | Summer | Barcelona | Basketball | Basketball Men's Basketball | NaN | China | NaN | 0 | 0 | 0 |
| 1 | 2 | A Lamusi | M | 23.0 | 170.0 | 60.0 | China | CHN | 2012 Summer | 2012 | Summer | London | Judo | Judo Men's Extra-Lightweight | NaN | China | NaN | 0 | 0 | 0 |
| 2 | 3 | Gunnar Nielsen Aaby | M | 24.0 | NaN | NaN | Denmark | DEN | 1920 Summer | 1920 | Summer | Antwerpen | Football | Football Men's Football | NaN | Denmark | NaN | 0 | 0 | 0 |
| 3 | 4 | Edgar Lindenau Aabye | M | 34.0 | NaN | NaN | Denmark/Sweden | DEN | 1900 Summer | 1900 | Summer | Paris | Tug-Of-War | Tug-Of-War Men's Tug-Of-War | Gold | Denmark | NaN | 0 | 1 | 0 |
| 4 | 8 | Cornelia "Cor" Aalten (-Strannood) | F | 18.0 | 168.0 | NaN | Netherlands | NED | 1932 Summer | 1932 | Summer | Los Angeles | Athletics | Athletics Women's 100 metres | NaN | Netherlands | NaN | 0 | 0 | 0 |
df['Year'].unique().shape[0]-1 #total number of olympics
28
df['City'].unique().shape
(23,)
df['Sport'].unique().shape
(52,)
df['Event'].unique().shape
(651,)
df['Name'].unique().shape
(116122,)
df['region'].unique().shape
(206,)
df.drop_duplicates(subset=['Year','region'])
| ID | Name | Sex | Age | Height | Weight | Team | NOC | Games | Year | Season | City | Sport | Event | Medal | region | notes | Bronze | Gold | Silver | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | A Dijiang | M | 24.0 | 180.0 | 80.0 | China | CHN | 1992 Summer | 1992 | Summer | Barcelona | Basketball | Basketball Men's Basketball | NaN | China | NaN | 0 | 0 | 0 |
| 1 | 2 | A Lamusi | M | 23.0 | 170.0 | 60.0 | China | CHN | 2012 Summer | 2012 | Summer | London | Judo | Judo Men's Extra-Lightweight | NaN | China | NaN | 0 | 0 | 0 |
| 2 | 3 | Gunnar Nielsen Aaby | M | 24.0 | NaN | NaN | Denmark | DEN | 1920 Summer | 1920 | Summer | Antwerpen | Football | Football Men's Football | NaN | Denmark | NaN | 0 | 0 | 0 |
| 3 | 4 | Edgar Lindenau Aabye | M | 34.0 | NaN | NaN | Denmark/Sweden | DEN | 1900 Summer | 1900 | Summer | Paris | Tug-Of-War | Tug-Of-War Men's Tug-Of-War | Gold | Denmark | NaN | 0 | 1 | 0 |
| 4 | 8 | Cornelia "Cor" Aalten (-Strannood) | F | 18.0 | 168.0 | NaN | Netherlands | NED | 1932 Summer | 1932 | Summer | Los Angeles | Athletics | Athletics Women's 100 metres | NaN | Netherlands | NaN | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 198805 | 121320 | Horatio Tertuliano Torrom | M | NaN | NaN | NaN | Argentina | ARG | 1908 Summer | 1908 | Summer | London | Figure Skating | Figure Skating Men's Singles | NaN | Argentina | NaN | 0 | 0 | 0 |
| 203075 | 123858 | Lloyd Oscar Valberg | M | 26.0 | NaN | NaN | Singapore | SGP | 1948 Summer | 1948 | Summer | London | Athletics | Athletics Men's High Jump | NaN | NaN | NaN | 0 | 0 | 0 |
| 209763 | 127894 | Bruno Julius Wagner | M | 25.0 | 185.0 | 82.0 | Switzerland | SUI | 1908 Summer | 1908 | Summer | London | Athletics | Athletics Men's Hammer Throw | NaN | Switzerland | NaN | 0 | 0 | 0 |
| 215352 | 131186 | Antoni Wiwulski | M | 35.0 | NaN | NaN | Poland | POL | 1912 Summer | 1912 | Summer | Stockholm | Art Competitions | Art Competitions Mixed Sculpturing | NaN | Poland | NaN | 0 | 0 | 0 |
| 216735 | 132030 | Mariana Dias Ximenes | F | 24.0 | 151.0 | 48.0 | Timor Leste | TLS | 2008 Summer | 2008 | Summer | Beijing | Athletics | Athletics Women's Marathon | NaN | Timor-Leste | NaN | 0 | 0 | 0 |
2786 rows × 20 columns
nations_overtime = df.drop_duplicates(subset=['Year','region'])['Year'].value_counts().sort_values().reset_index()
nations_overtime.rename(columns = {'index' : 'Edition','Year' : 'Number of Countries'},inplace=True)
#draw line graph
import plotly.express as px
fig = px.line(nations_overtime , x = 'Edition' , y = 'Number of Countries')
fig.show()
#number of events over years
events_overtime = df.drop_duplicates(subset=['Year','Event'])['Year'].value_counts().sort_values().reset_index()
events_overtime.head()
| index | Year | |
|---|---|---|
| 0 | 1896 | 43 |
| 1 | 1906 | 74 |
| 2 | 1900 | 90 |
| 3 | 1904 | 95 |
| 4 | 1912 | 107 |
events_overtime.rename(columns={'index':'Edition','Year':'Number of Events'},inplace=True)
events_overtime
| Edition | Number of Events | |
|---|---|---|
| 0 | 1896 | 43 |
| 1 | 1906 | 74 |
| 2 | 1900 | 90 |
| 3 | 1904 | 95 |
| 4 | 1912 | 107 |
| 5 | 1908 | 109 |
| 6 | 1928 | 122 |
| 7 | 1932 | 131 |
| 8 | 1924 | 131 |
| 9 | 1952 | 149 |
| 10 | 1936 | 150 |
| 11 | 1960 | 150 |
| 12 | 1956 | 151 |
| 13 | 1948 | 153 |
| 14 | 1920 | 158 |
| 15 | 1964 | 163 |
| 16 | 1968 | 172 |
| 17 | 1972 | 193 |
| 18 | 1976 | 198 |
| 19 | 1980 | 203 |
| 20 | 1984 | 221 |
| 21 | 1988 | 237 |
| 22 | 1992 | 257 |
| 23 | 1996 | 271 |
| 24 | 2000 | 300 |
| 25 | 2004 | 301 |
| 26 | 2012 | 302 |
| 27 | 2008 | 302 |
| 28 | 2016 | 306 |
fig2 = px.line(events_overtime,x='Edition',y='Number of Events')
fig2.show()
import seaborn as sns
df.head()
| ID | Name | Sex | Age | Height | Weight | Team | NOC | Games | Year | Season | City | Sport | Event | Medal | region | notes | Bronze | Gold | Silver | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | A Dijiang | M | 24.0 | 180.0 | 80.0 | China | CHN | 1992 Summer | 1992 | Summer | Barcelona | Basketball | Basketball Men's Basketball | NaN | China | NaN | 0 | 0 | 0 |
| 1 | 2 | A Lamusi | M | 23.0 | 170.0 | 60.0 | China | CHN | 2012 Summer | 2012 | Summer | London | Judo | Judo Men's Extra-Lightweight | NaN | China | NaN | 0 | 0 | 0 |
| 2 | 3 | Gunnar Nielsen Aaby | M | 24.0 | NaN | NaN | Denmark | DEN | 1920 Summer | 1920 | Summer | Antwerpen | Football | Football Men's Football | NaN | Denmark | NaN | 0 | 0 | 0 |
| 3 | 4 | Edgar Lindenau Aabye | M | 34.0 | NaN | NaN | Denmark/Sweden | DEN | 1900 Summer | 1900 | Summer | Paris | Tug-Of-War | Tug-Of-War Men's Tug-Of-War | Gold | Denmark | NaN | 0 | 1 | 0 |
| 4 | 8 | Cornelia "Cor" Aalten (-Strannood) | F | 18.0 | 168.0 | NaN | Netherlands | NED | 1932 Summer | 1932 | Summer | Los Angeles | Athletics | Athletics Women's 100 metres | NaN | Netherlands | NaN | 0 | 0 | 0 |
temp_df = df.dropna(subset=['Medal'])
temp_df['Name'].value_counts().reset_index().merge(df,left_on='index',right_on='Name',how='left')
| index | Name_x | ID | Name_y | Sex | Age | Height | Weight | Team | NOC | ... | Season | City | Sport | Event | Medal | region | notes | Bronze | Gold | Silver | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Michael Fred Phelps, II | 28 | 94406 | Michael Fred Phelps, II | M | 15.0 | 193.0 | 91.0 | United States | USA | ... | Summer | Sydney | Swimming | Swimming Men's 200 metres Butterfly | NaN | USA | NaN | 0 | 0 | 0 |
| 1 | Michael Fred Phelps, II | 28 | 94406 | Michael Fred Phelps, II | M | 19.0 | 193.0 | 91.0 | United States | USA | ... | Summer | Athina | Swimming | Swimming Men's 200 metres Freestyle | Bronze | USA | NaN | 1 | 0 | 0 |
| 2 | Michael Fred Phelps, II | 28 | 94406 | Michael Fred Phelps, II | M | 19.0 | 193.0 | 91.0 | United States | USA | ... | Summer | Athina | Swimming | Swimming Men's 4 x 100 metres Freestyle Relay | Bronze | USA | NaN | 1 | 0 | 0 |
| 3 | Michael Fred Phelps, II | 28 | 94406 | Michael Fred Phelps, II | M | 19.0 | 193.0 | 91.0 | United States | USA | ... | Summer | Athina | Swimming | Swimming Men's 4 x 200 metres Freestyle Relay | Gold | USA | NaN | 0 | 1 | 0 |
| 4 | Michael Fred Phelps, II | 28 | 94406 | Michael Fred Phelps, II | M | 19.0 | 193.0 | 91.0 | United States | USA | ... | Summer | Athina | Swimming | Swimming Men's 100 metres Butterfly | Gold | USA | NaN | 0 | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 63088 | Preben Isaksson | 1 | 52433 | Preben Isaksson | M | 21.0 | 183.0 | 73.0 | Denmark | DEN | ... | Summer | Tokyo | Cycling | Cycling Men's Individual Pursuit, 4,000 metres | Bronze | Denmark | NaN | 1 | 0 | 0 |
| 63089 | Preben Isaksson | 1 | 52433 | Preben Isaksson | M | 21.0 | 183.0 | 73.0 | Denmark | DEN | ... | Summer | Tokyo | Cycling | Cycling Men's Team Pursuit, 4,000 metres | NaN | Denmark | NaN | 0 | 0 | 0 |
| 63090 | rpd Lengyel | 1 | 68707 | rpd Lengyel | M | 20.0 | NaN | NaN | Hungary | HUN | ... | Summer | Berlin | Swimming | Swimming Men's 400 metres Freestyle | NaN | Hungary | NaN | 0 | 0 | 0 |
| 63091 | rpd Lengyel | 1 | 68707 | rpd Lengyel | M | 20.0 | NaN | NaN | Hungary | HUN | ... | Summer | Berlin | Swimming | Swimming Men's 4 x 200 metres Freestyle Relay | Bronze | Hungary | NaN | 1 | 0 | 0 |
| 63092 | rpd Lengyel | 1 | 68707 | rpd Lengyel | M | 20.0 | NaN | NaN | Hungary | HUN | ... | Summer | Berlin | Swimming | Swimming Men's 100 metres Backstroke | NaN | Hungary | NaN | 0 | 0 | 0 |
63093 rows × 22 columns
sns.heatmap(df.pivot_table(index='Sport',column='Year',values='Event',aggfunc='count')).fillna(0).astype('int'),annot=True)
Input In [65] sns.heatmap(df.pivot_table(index='Sport',column='Year',values='Event',aggfunc='count')).fillna(0).astype('int'),annot=True) ^ SyntaxError: cannot assign to function call
x =df.drop_duplicates(subset=['Year','Sport','Event'])
x
x.pivot_table(index='Sport',columns='Year',values = 'Event',aggfunc='count').fillna(0).astype('int')
import seaborn as sns
plt.figure(figsize=(25,25))
sns.heatmap(x.pivot_table(index='Sport',columns='Year',values = 'Event',aggfunc='count').fillna(0).astype('int'),annot=True)
#left_on: Columns from the left DataFrame to use as keys. Can either be column names or arrays with length equal to the length of the DataFrame
#right_on: Columns from the right DataFrame to use as keys. Can either be column names or arrays with length equal to the length of the DataFrame
#.reset_index() converts it into a dataframe and dataframes can be merged using pandas
def most_successful(df,sport):
temp_df = df.dropna(subset=['Medal']) #filter those players with Nan/zero medals
if(sport != 'Overall'):
temp_df = temp_df[temp_df['Sport']==sport] #shows only names with the given sport
x = temp_df['Name'].value_counts().reset_index().head(15).merge(df,left_on='index',right_on='Name',how='left')[['index','Name_x','Sport','region']].drop_duplicates('index')
#means here df is right table and temp_df is left table and for merging take index of temp_df and Name of df
x.rename(columns={'index':'Name','Name_x':'Number of Medals'},inplace=True)
return x
most_successful(df,'Overall')
#CountryWise
#Countrywise medal tally per year line plot
#WHat countries are good at heatmap
#Most Successful Athletes Top 10
df
temp_df = df.dropna(subset=['Medal'])
temp_df.drop_duplicates(subset=['Team','Games','Medal','region','NOC','Event','Sport'],inplace=True)
new_df = temp_df[temp_df['region']=='India']
final_df = new_df.groupby('Year').count()['Medal'].reset_index()
final_df
fig = px.line(final_df,x='Year',y='Medal')
fig.show()
temp_df = df.dropna(subset=['Medal'])
temp_df.drop_duplicates(subset=['Team','Games','Medal','region','NOC','Event','Sport'],inplace=True)
new_df = temp_df[temp_df['region']=='India']
plt.figure(figsize=(20,20))
sns.heatmap(new_df.pivot_table(index='Sport',columns = 'Year', values = 'Medal' , aggfunc = 'count').fillna(0),annot = True)
def most_successful_countrywise(df,country):
temp_df = df.dropna(subset=['Medal']) #filter those players with Nan/zero medals
temp_df = temp_df[temp_df['region']==country] #shows only names with the given sport
x = temp_df['Name'].value_counts().reset_index().head(10).merge(df,left_on='index',right_on='Name',how='left')[['index','Name_x','Sport','region']].drop_duplicates('index')
#means here df is right table and temp_df is left table and for merging take index of temp_df and Name of df
#here name value count means this name or athlete has got medals how many times means how many medals he has got that much successful he is
x.rename(columns={'index':'Name','Name_x':'Number of Medals'},inplace=True)
return x
most_successful_countrywise(df,'USA')
#Athlete Analysis
import plotly.figure_factory as ff
athlete_df = df.drop_duplicates(subset = ['Name','region'])
x1 = athlete_df['Age'].dropna()
x2=athlete_df[athlete_df['Medal']=='Gold']['Age'].dropna()
x3=athlete_df[athlete_df['Medal']=='Silver']['Age'].dropna()
x4=athlete_df[athlete_df['Medal']=='Bronze']['Age'].dropna()
fig=ff.create_distplot([x1,x2,x3,x4],['Overall Age ','Gold Medalist','Silver Medalist','Bronze Medalist'])
fig.show()
fig=ff.create_distplot([x1,x2,x3,x4],['Overall Age ','Gold Medalist','Silver Medalist','Bronze Medalist'],show_hist=False,show_rug=False)
fig.show()
famous_sports = ['Basketball'
, 'Judo'
, 'Football'
, 'Tug-Of-War'
, 'Athletics'
, 'Hockey'
, 'Rowing'
, 'Golf',
'Sailing','Polo','Shooting'
]
df['Sport']
x=[]
name=[]
for i in famous_sports:
temp_df = athlete_df[athlete_df['Sport']==i]
x.append(temp_df[temp_df['Medal']=='Gold']['Age'].dropna())
name.append(i)
fig = ff.create_distplot(x,name,show_hist=False,show_rug=False)
fig.show()
athlete_df
athlete_df['Medal'].fillna('No Medal',inplace = True)
plt.figure(figsize=(10,10))#used to increase size of the plot in next line
temp_df = athlete_df[athlete_df['Sport']=='Athletics']
sns.scatterplot(temp_df['Height'],temp_df['Weight'],hue=temp_df['Medal'],style=temp_df['Sex'],s=100)
#style means style of point and s means size of each marker/point
men = athlete_df[athlete_df['Sex']=='M'].groupby('Year').count()#['Name'].reset_index()
women = athlete_df[athlete_df['Sex']=='F'].groupby('Year').count()['Name'].reset_index()
men
men.head()
men = athlete_df[athlete_df['Sex']=='M'].groupby('Year').count()['Name'].reset_index()
women = athlete_df[athlete_df['Sex']=='F'].groupby('Year').count()['Name'].reset_index()
men
final = men.merge(women,on='Year',how='left')
final.rename(columns={'Name_x':'Male','Name_y':'Female'},inplace = True)
final=final.fillna(0)
final
fig = px.line(final,x='Year',y=['Male','Female'])
fig.show()